library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(TH.data)
library(psych)
library(whitening)
library("vioplot")
library("rpart")
library(mlbench)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Source W. Nick Street, Olvi L. Mangasarian and William H. Wolberg (1995). An inductive learning approach to prognostic prediction. In A. Prieditis and S. Russell, editors, Proceedings of the Twelfth International Conference on Machine Learning, pages 522–530, San Francisco, Morgan Kaufmann.
Peter Buehlmann and Torsten Hothorn (2007), Boosting algorithms: regularization, prediction and model fitting. Statistical Science, 22(4), 477–505.
wpbc {TH.data}
data("wpbc", package = "TH.data")
table(wpbc[,"status"])
#>
#> N R
#> 151 47
sum(1*(wpbc[,"status"]=="R" & wpbc$time <= 24))
#> [1] 29
wpbc <- subset(wpbc,time > 36 | status=="R" )
summary(wpbc$time)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1.00 36.75 60.50 58.79 78.75 125.00
wpbc[,"status"] <- 1*(wpbc[,"status"]=="R")
wpbc <- wpbc[complete.cases(wpbc),]
pander::pander(table(wpbc[,"status"]))
| 0 | 1 |
|---|---|
| 91 | 46 |
wpbc$time <- NULL
studyName <- "Wisconsin"
dataframe <- wpbc
outcome <- "status"
thro <- 0.4
TopVariables <- 10
cexheat = 0.25
Some libraries
library(psych)
library(whitening)
library("vioplot")
library("rpart")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 137 | 32 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 91 | 46 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1500
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples
dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData
numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000
if (!largeSet)
{
hm <- heatMaps(data=dataframeScaled[1:numsub,],
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9961379
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> mean_perimeter mean_texture tsize
#> mean_radius mean_texture mean_perimeter mean_area
#> 0.96875 0.50000 1.00000 0.93750
#> mean_smoothness mean_compactness
#> 0.34375 0.40625
#>
#> Included: 32 , Uni p: 0.0046875 , Base Size: 3 , Rcrit: 0.2212374
#>
#>
1 <R=0.996,thr=0.950>, Top: 3< 2 >[Fa= 3 ]( 3 , 6 , 0 ),<|>Tot Used: 9 , Added: 6 , Zero Std: 0 , Max Cor: 0.922
#>
2 <R=0.922,thr=0.900>, Top: 1< 1 >[Fa= 3 ]( 1 , 1 , 3 ),<|>Tot Used: 9 , Added: 1 , Zero Std: 0 , Max Cor: 0.891
#>
3 <R=0.891,thr=0.800>, Top: 6< 1 >[Fa= 9 ]( 6 , 6 , 3 ),<|>Tot Used: 18 , Added: 6 , Zero Std: 0 , Max Cor: 0.842
#>
4 <R=0.842,thr=0.800>, Top: 1< 1 >[Fa= 10 ]( 1 , 1 , 9 ),<|>Tot Used: 20 , Added: 1 , Zero Std: 0 , Max Cor: 0.789
#>
5 <R=0.789,thr=0.700>, Top: 6< 1 >[Fa= 12 ]( 6 , 6 , 10 ),<|>Tot Used: 25 , Added: 6 , Zero Std: 0 , Max Cor: 0.743
#>
6 <R=0.743,thr=0.700>, Top: 2< 1 >[Fa= 13 ]( 2 , 2 , 12 ),<|>Tot Used: 27 , Added: 2 , Zero Std: 0 , Max Cor: 0.698
#>
7 <R=0.698,thr=0.600>, Top: 3< 2 >[Fa= 13 ]( 3 , 4 , 13 ),<|>Tot Used: 27 , Added: 4 , Zero Std: 0 , Max Cor: 0.768
#>
8 <R=0.768,thr=0.700>, Top: 2< 1 >[Fa= 13 ]( 2 , 2 , 13 ),<|>Tot Used: 27 , Added: 2 , Zero Std: 0 , Max Cor: 0.600
#>
9 <R=0.600,thr=0.600>, Top: 1< 1 >[Fa= 13 ]( 1 , 1 , 13 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.749
#>
10 <R=0.749,thr=0.700>, Top: 1< 1 >[Fa= 13 ]( 1 , 1 , 13 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.600
#>
11 <R=0.600,thr=0.500>, Top: 7< 1 >[Fa= 15 ]( 7 , 8 , 13 ),<|>Tot Used: 28 , Added: 8 , Zero Std: 0 , Max Cor: 0.685
#>
12 <R=0.685,thr=0.600>, Top: 2< 1 >[Fa= 15 ]( 2 , 2 , 15 ),<|>Tot Used: 28 , Added: 2 , Zero Std: 0 , Max Cor: 0.607
#>
13 <R=0.607,thr=0.600>, Top: 1< 1 >[Fa= 15 ]( 1 , 1 , 15 ),<|>Tot Used: 28 , Added: 1 , Zero Std: 0 , Max Cor: 0.580
#>
14 <R=0.580,thr=0.500>, Top: 1< 1 >[Fa= 16 ]( 1 , 1 , 15 ),<|>Tot Used: 28 , Added: 1 , Zero Std: 0 , Max Cor: 0.500
#>
15 <R=0.500,thr=0.400>, Top: 9< 1 >[Fa= 18 ]( 8 , 11 , 16 ),<|>Tot Used: 32 , Added: 11 , Zero Std: 0 , Max Cor: 0.573
#>
16 <R=0.573,thr=0.500>, Top: 2< 1 >[Fa= 18 ]( 2 , 2 , 18 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.548
#>
17 <R=0.548,thr=0.500>, Top: 1< 1 >[Fa= 18 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.630
#>
18 <R=0.630,thr=0.600>, Top: 1< 1 >[Fa= 19 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.493
#>
19 <R=0.493,thr=0.400>, Top: 5< 1 >[Fa= 20 ]( 5 , 6 , 19 ),<|>Tot Used: 32 , Added: 6 , Zero Std: 0 , Max Cor: 0.512
#>
20 <R=0.512,thr=0.500>, Top: 1< 1 >[Fa= 21 ]( 1 , 1 , 20 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.469
#>
21 <R=0.469,thr=0.400>, Top: 7< 1 >[Fa= 22 ]( 4 , 4 , 21 ),<|>Tot Used: 32 , Added: 4 , Zero Std: 0 , Max Cor: 0.501
#>
22 <R=0.501,thr=0.500>, Top: 1< 1 >[Fa= 22 ]( 1 , 1 , 22 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.488
#>
23 <R=0.488,thr=0.400>, Top: 2< 1 >[Fa= 22 ]( 2 , 2 , 22 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.398
#>
24 <R=0.398,thr=0.400>
#>
[ 24 ], 0.3981401 Decor Dimension: 32 Nused: 32 . Cor to Base: 24 , ABase: 32 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
515156
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
6371
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
1.39
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
1.3
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPLTM <- attr(DEdataframe,"UPLTM")
gplots::heatmap.2(1.0*(abs(UPLTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
Displaying the features associations
par(op)
if (ncol(dataframe) < 1000)
{
DEdataframeB <- ILAA(dataframe,verbose=TRUE,thr=thro,bootstrap=30)
transform <- 1*(attr(DEdataframeB,"UPLTM") != 0)
print(ncol(transform))
thrcol <- 1 + 0.025*nrow(transform)
rsum <- apply(1*(transform !=0),1,sum) > 2
csum <- apply(1*(transform !=0),2,sum) > thrcol | rsum
transform <- transform[csum,csum]
csum <- (apply(1*(transform !=0),2,sum) > 1) & (apply(1*(transform !=0),1,sum) > 1)
transform <- transform[csum,csum]
print(ncol(transform))
if (ncol(transform)>100)
{
thrcol <- 1 + 0.10*nrow(transform)
rsum <- apply(1*(transform !=0),1,sum) > 4
csum <- apply(1*(transform !=0),2,sum) > thrcol | rsum
transform <- transform[csum,csum]
csum <- (apply(1*(transform !=0),2,sum) > 3) & (apply(1*(transform !=0),1,sum) > 3)
transform <- transform[csum,csum]
}
print(ncol(transform))
if (ncol(transform)>100)
{
thrcol <- 1 + 0.20*nrow(transform)
rsum <- apply(1*(transform !=0),1,sum) > 8
csum <- apply(1*(transform !=0),2,sum) > thrcol | rsum
transform <- transform[csum,csum]
csum <- (apply(1*(transform !=0),2,sum) > 7) & (apply(1*(transform !=0),1,sum) > 7)
transform <- transform[csum,csum]
}
print(ncol(transform))
if ((ncol(transform) > 10) && (ncol(transform) < 150))
{
gplots::heatmap.2(transform,
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Red Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
colnames(transform) <- str_remove_all(colnames(transform),"La_")
VertexSize <- apply(transform,2,mean)
VertexSize <- 5*VertexSize/max(VertexSize)
gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
gr$layout <- layout_with_fr
fc <- cluster_optimal(gr)
plot(fc, gr,
edge.width = 0.5*E(gr)$weight,
vertex.size=VertexSize,
edge.arrow.size=0.5,
edge.arrow.width=0.5,
vertex.label.cex=0.65,
vertex.label.dist=1,
main="Feature Association")
}
}
#> fast | LM |
#> mean_perimeter mean_texture tsize
#> mean_radius mean_texture mean_perimeter mean_area
#> 0.96875 0.50000 1.00000 0.93750
#> mean_smoothness mean_compactness
#> 0.34375 0.40625
#>
#> Included: 32 , Uni p: 0.0046875 , Base Size: 3 , Rcrit: 0.2212374
#>
#>
1 <R=0.996,thr=0.950>, Top: 3< 2 >[Fa= 3 ]( 3 , 6 , 0 ),<|>Tot Used: 9 , Added: 6 , Zero Std: 0 , Max Cor: 0.922
#>
2 <R=0.922,thr=0.900>, Top: 1< 1 >[Fa= 3 ]( 1 , 1 , 3 ),<|>Tot Used: 9 , Added: 1 , Zero Std: 0 , Max Cor: 0.891
#>
3 <R=0.891,thr=0.800>, Top: 6< 1 >[Fa= 9 ]( 6 , 6 , 3 ),<|>Tot Used: 18 , Added: 6 , Zero Std: 0 , Max Cor: 0.842
#>
4 <R=0.842,thr=0.800>, Top: 1< 1 >[Fa= 10 ]( 1 , 1 , 9 ),<|>Tot Used: 20 , Added: 1 , Zero Std: 0 , Max Cor: 0.789
#>
5 <R=0.789,thr=0.700>, Top: 6< 1 >[Fa= 12 ]( 6 , 6 , 10 ),<|>Tot Used: 25 , Added: 6 , Zero Std: 0 , Max Cor: 0.743
#>
6 <R=0.743,thr=0.700>, Top: 2< 1 >[Fa= 13 ]( 2 , 2 , 12 ),<|>Tot Used: 27 , Added: 2 , Zero Std: 0 , Max Cor: 0.698
#>
7 <R=0.698,thr=0.600>, Top: 3< 2 >[Fa= 13 ]( 3 , 4 , 13 ),<|>Tot Used: 27 , Added: 4 , Zero Std: 0 , Max Cor: 0.768
#>
8 <R=0.768,thr=0.700>, Top: 2< 1 >[Fa= 13 ]( 2 , 2 , 13 ),<|>Tot Used: 27 , Added: 2 , Zero Std: 0 , Max Cor: 0.600
#>
9 <R=0.600,thr=0.600>, Top: 1< 1 >[Fa= 13 ]( 1 , 1 , 13 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.749
#>
10 <R=0.749,thr=0.700>, Top: 1< 1 >[Fa= 13 ]( 1 , 1 , 13 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.600
#>
11 <R=0.600,thr=0.500>, Top: 7< 1 >[Fa= 15 ]( 7 , 8 , 13 ),<|>Tot Used: 28 , Added: 8 , Zero Std: 0 , Max Cor: 0.685
#>
12 <R=0.685,thr=0.600>, Top: 2< 1 >[Fa= 15 ]( 2 , 2 , 15 ),<|>Tot Used: 28 , Added: 2 , Zero Std: 0 , Max Cor: 0.607
#>
13 <R=0.607,thr=0.600>, Top: 1< 1 >[Fa= 15 ]( 1 , 1 , 15 ),<|>Tot Used: 28 , Added: 1 , Zero Std: 0 , Max Cor: 0.580
#>
14 <R=0.580,thr=0.500>, Top: 1< 1 >[Fa= 16 ]( 1 , 1 , 15 ),<|>Tot Used: 28 , Added: 1 , Zero Std: 0 , Max Cor: 0.500
#>
15 <R=0.500,thr=0.400>, Top: 9< 1 >[Fa= 18 ]( 8 , 11 , 16 ),<|>Tot Used: 32 , Added: 11 , Zero Std: 0 , Max Cor: 0.573
#>
16 <R=0.573,thr=0.500>, Top: 2< 1 >[Fa= 18 ]( 2 , 2 , 18 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.548
#>
17 <R=0.548,thr=0.500>, Top: 1< 1 >[Fa= 18 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.630
#>
18 <R=0.630,thr=0.600>, Top: 1< 1 >[Fa= 19 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.493
#>
19 <R=0.493,thr=0.400>, Top: 5< 1 >[Fa= 20 ]( 5 , 6 , 19 ),<|>Tot Used: 32 , Added: 6 , Zero Std: 0 , Max Cor: 0.512
#>
20 <R=0.512,thr=0.500>, Top: 1< 1 >[Fa= 21 ]( 1 , 1 , 20 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.469
#>
21 <R=0.469,thr=0.400>, Top: 7< 1 >[Fa= 22 ]( 4 , 4 , 21 ),<|>Tot Used: 32 , Added: 4 , Zero Std: 0 , Max Cor: 0.501
#>
22 <R=0.501,thr=0.500>, Top: 1< 1 >[Fa= 22 ]( 1 , 1 , 22 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.488
#>
23 <R=0.488,thr=0.400>, Top: 2< 1 >[Fa= 22 ]( 2 , 2 , 22 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.398
#>
24 <R=0.398,thr=0.400>
#>
[ 24 ], 0.3981401 Decor Dimension: 32 Nused: 32 . Cor to Base: 24 , ABase: 32 , Outcome Base: 0
#>
bootstrapping->..............................
#>
[1] 32
#> [1] 27
#> [1] 27
#> [1] 27
par(op)
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after ILAA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Pearson Correlation|",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.3981401
if (nrow(dataframe) < 1000)
{
classes <- unique(dataframe[1:numsub,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}
if (nrow(dataframe) < 1000)
{
datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| tsize | 3.47 | 2.03 | 2.64 | 1.86 | 1.11e-03 | 0.666 |
| pnodes | 4.87 | 6.02 | 2.63 | 5.21 | 6.25e-09 | 0.650 |
| worst_radius | 22.67 | 4.70 | 20.35 | 4.08 | 3.68e-01 | 0.647 |
| worst_perimeter | 151.33 | 32.42 | 135.34 | 26.85 | 5.71e-01 | 0.645 |
| mean_area | 1081.98 | 397.26 | 888.40 | 310.85 | 1.26e-01 | 0.645 |
| worst_area | 1635.77 | 703.15 | 1317.95 | 550.94 | 2.72e-01 | 0.643 |
| mean_perimeter | 121.10 | 22.91 | 110.02 | 19.19 | 4.72e-01 | 0.641 |
| mean_radius | 18.33 | 3.37 | 16.70 | 2.91 | 3.12e-01 | 0.639 |
| SE_perimeter | 4.73 | 2.21 | 3.81 | 1.80 | 6.37e-02 | 0.634 |
| SE_area | 81.97 | 53.36 | 61.22 | 37.72 | 6.46e-02 | 0.632 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
pander::pander(finalTable)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| tsize | 3.47174 | 2.02985 | 2.63846 | 1.85507 | 0.00111 | 0.666 |
| La_worst_area | 409.10338 | 59.42237 | 440.08923 | 60.65702 | 0.52089 | 0.653 |
| La_SE_symmetry | -0.00356 | 0.00448 | -0.00579 | 0.00572 | 0.23435 | 0.645 |
| mean_perimeter | 121.09522 | 22.91019 | 110.02231 | 19.18940 | 0.47168 | 0.641 |
| La_worst_fractaldim | -0.09538 | 0.00580 | -0.09896 | 0.00862 | 0.93594 | 0.639 |
| La_mean_smoothness | 0.09341 | 0.00508 | 0.09102 | 0.00555 | 0.90681 | 0.626 |
| La_mean_fractaldim | 0.08522 | 0.00324 | 0.08699 | 0.00471 | 0.35987 | 0.600 |
| La_worst_perimeter | -5.80502 | 11.69785 | -7.42934 | 11.47409 | 0.02995 | 0.577 |
| La_SE_concavity | -0.01260 | 0.00681 | -0.01037 | 0.00581 | 0.22238 | 0.573 |
| La_worst_concavity | 0.14327 | 0.09717 | 0.11997 | 0.08164 | 0.81386 | 0.572 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 4.79 | 29 | 0.906 |
theCharformulas <- attr(dc,"LatentCharFormulas")
finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| tsize | NA | 3.47e+00 | 2.03e+00 | 2.64e+00 | 1.86e+00 | 1.11e-03 | 0.666 | 0.666 | 1 |
| tsize1 | NA | 3.47e+00 | 2.03e+00 | 2.64e+00 | 1.86e+00 | 1.11e-03 | 0.666 | NA | NA |
| La_worst_area | + (2.56e+02)mean_radius - (4.838)mean_perimeter - (2.009)mean_area - (1.39e+02)worst_radius + (0.035)worst_perimeter + worst_area | 4.09e+02 | 5.94e+01 | 4.40e+02 | 6.07e+01 | 5.21e-01 | 0.653 | 0.643 | -2 |
| pnodes | NA | 4.87e+00 | 6.02e+00 | 2.63e+00 | 5.21e+00 | 6.25e-09 | 0.650 | 0.650 | NA |
| worst_radius | NA | 2.27e+01 | 4.70e+00 | 2.03e+01 | 4.08e+00 | 3.68e-01 | 0.647 | 0.647 | NA |
| worst_perimeter | NA | 1.51e+02 | 3.24e+01 | 1.35e+02 | 2.68e+01 | 5.71e-01 | 0.645 | 0.645 | NA |
| La_SE_symmetry | - (2.99e-03)mean_radius + (5.30e-04)mean_perimeter + (3.57e-03)mean_symmetry - (0.376)mean_fractaldim + SE_symmetry - (2.978)SE_fractaldim - (3.80e-03)worst_compactness - (0.099)worst_symmetry + (0.375)worst_fractaldim | -3.56e-03 | 4.48e-03 | -5.79e-03 | 5.72e-03 | 2.34e-01 | 0.645 | 0.504 | -4 |
| mean_area | NA | 1.08e+03 | 3.97e+02 | 8.88e+02 | 3.11e+02 | 1.26e-01 | 0.645 | 0.645 | NA |
| worst_area | NA | 1.64e+03 | 7.03e+02 | 1.32e+03 | 5.51e+02 | 2.72e-01 | 0.643 | 0.643 | NA |
| mean_perimeter | NA | 1.21e+02 | 2.29e+01 | 1.10e+02 | 1.92e+01 | 4.72e-01 | 0.641 | 0.641 | 9 |
| mean_perimeter1 | NA | 1.21e+02 | 2.29e+01 | 1.10e+02 | 1.92e+01 | 4.72e-01 | 0.641 | NA | NA |
| La_worst_fractaldim | - (0.029)mean_radius + (4.31e-03)mean_perimeter - (2.460)mean_fractaldim - (0.071)worst_compactness + worst_fractaldim | -9.54e-02 | 5.80e-03 | -9.90e-02 | 8.62e-03 | 9.36e-01 | 0.639 | 0.583 | -1 |
| mean_radius | NA | 1.83e+01 | 3.37e+00 | 1.67e+01 | 2.91e+00 | 3.12e-01 | 0.639 | 0.639 | NA |
| SE_perimeter | NA | 4.73e+00 | 2.21e+00 | 3.81e+00 | 1.80e+00 | 6.37e-02 | 0.634 | 0.634 | NA |
| SE_area | NA | 8.20e+01 | 5.34e+01 | 6.12e+01 | 3.77e+01 | 6.46e-02 | 0.632 | 0.632 | NA |
| La_mean_smoothness | - (0.012)mean_radius + (2.32e-03)mean_perimeter + mean_smoothness - (0.411)mean_concavepoints - (0.521)mean_fractaldim | 9.34e-02 | 5.08e-03 | 9.10e-02 | 5.55e-03 | 9.07e-01 | 0.626 | 0.518 | -1 |
| La_mean_fractaldim | + (0.019)mean_radius - (2.69e-03)mean_perimeter + mean_fractaldim | 8.52e-02 | 3.24e-03 | 8.70e-02 | 4.71e-03 | 3.60e-01 | 0.600 | 0.615 | 5 |
| La_worst_perimeter | - (1.298)mean_perimeter + worst_perimeter | -5.81e+00 | 1.17e+01 | -7.43e+00 | 1.15e+01 | 2.99e-02 | 0.577 | 0.645 | 2 |
| La_SE_concavity | - (0.766)SE_compactness + SE_concavity - (1.245)SE_concavepoints + (0.455)SE_fractaldim + (0.064)worst_compactness - (0.077)worst_concavity | -1.26e-02 | 6.81e-03 | -1.04e-02 | 5.81e-03 | 2.22e-01 | 0.573 | 0.478 | -2 |
| La_worst_concavity | - (0.830)worst_compactness + worst_concavity | 1.43e-01 | 9.72e-02 | 1.20e-01 | 8.16e-02 | 8.14e-01 | 0.572 | 0.492 | 1 |
featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE) #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous])
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)
#pander::pander(pc$rotation)
PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])
gplots::heatmap.2(abs(PCACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "PCA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
EFAdataframe <- dataframeScaled
if (length(iscontinous) < 2000)
{
topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
if (topred < 2) topred <- 2
uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE) # EFA analysis
predEFA <- predict(uls,dataframeScaled[,iscontinous])
EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous])
EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
gplots::heatmap.2(abs(EFACor),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "EFA Correlation",
cexRow = 0.5,
cexCol = 0.5,
srtCol=45,
srtRow= -45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
}
par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(rawmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
}
pander::pander(table(dataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 68 | 23 |
| 1 | 9 | 37 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.766 | 0.687 | 0.834 |
| 3 | se | 0.804 | 0.661 | 0.906 |
| 4 | sp | 0.747 | 0.645 | 0.833 |
| 6 | diag.or | 12.155 | 5.100 | 28.966 |
par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(IDeAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
}
pander::pander(table(DEdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 89 | 2 |
| 1 | 33 | 13 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.745 | 0.663 | 0.815 |
| 3 | se | 0.283 | 0.160 | 0.435 |
| 4 | sp | 0.978 | 0.923 | 0.997 |
| 6 | diag.or | 17.530 | 3.753 | 81.883 |
par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(PCAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}
pander::pander(table(PCAdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 86 | 5 |
| 1 | 27 | 19 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.766 | 0.687 | 0.834 |
| 3 | se | 0.413 | 0.270 | 0.568 |
| 4 | sp | 0.945 | 0.876 | 0.982 |
| 6 | diag.or | 12.104 | 4.128 | 35.493 |
par(op)
EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(EFAmodel,EFAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
text(EFAmodel, use.n = TRUE,cex=0.75)
ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
}
pander::pander(table(EFAdataframe[,outcome],pr))
| 0 | 1 | |
|---|---|---|
| 0 | 90 | 1 |
| 1 | 38 | 8 |
pander::pander(ptab$detail[c(5,3,4,6),])
| statistic | est | lower | upper | |
|---|---|---|---|---|
| 5 | diag.ac | 0.715 | 0.6320 | 0.789 |
| 3 | se | 0.174 | 0.0782 | 0.314 |
| 4 | sp | 0.989 | 0.9403 | 1.000 |
| 6 | diag.or | 18.947 | 2.2899 | 156.776 |
par(op)